library(seqinr)
library(ggrepel)
library(ggplot2)
# Load input files ----
# Genome sequences parsed by seqinr::read.fasta(); later code indexes them by
# chromosome name and then by base position.
SacCer <- read.fasta("~/data/feature_file/SacCer3.fa")
SacCer2 <- read.fasta("~/data/feature_file/sacCer2.fa")
# Reference nucleosome table; later code reads its `chr` and `pos` columns.
brogaard_nucleosome <- read.csv("~/data/feature_file/brogaard_nucleosome.csv")
# Fast / slow nucleosome tables; later code reads their `chr` and `peak` columns.
cac_fast_nuc <- read.csv("~/data/cac_fast_nuc.csv")
cac_slow_nuc <- read.csv("~/data/cac_slow_nuc.csv")

###
# figure3B dinucleotide frequency
###

# Annotate every row of `nuc` with the closest reference nucleosome on the same
# chromosome.
#
# nuc:       data frame with `chr` and `peak` columns.
# reference: data frame with `chr` and `pos` columns.
# Returns `nuc` with two added columns:
#   dist_to_brogaard - absolute distance from `peak` to the nearest `pos`
#   brogaard_pos     - that nearest `pos`
# Rows whose chromosome has no reference entry keep NA in both columns (and are
# therefore dropped by the distance filter below).
nearest_brogaard <- function(nuc, reference) {
  # Group the reference positions by chromosome once, instead of re-subsetting
  # the whole reference table for every nucleosome.
  ref_by_chr <- split(reference$pos, as.character(reference$chr))

  nuc$dist_to_brogaard <- NA
  nuc$brogaard_pos <- NA

  for (i in seq_len(nrow(nuc))) {
    chr_i <- as.character(nuc$chr[i])
    if (is.na(chr_i) || !(chr_i %in% names(ref_by_chr))) {
      next
    }
    ref_pos <- ref_by_chr[[chr_i]]
    gap <- abs(ref_pos - nuc$peak[i])
    # which.min() returns a single index (the first one on ties), so the
    # assignment below is always length one.
    nearest <- which.min(gap)
    if (length(nearest) == 0L) {
      next
    }
    nuc$dist_to_brogaard[i] <- gap[nearest]
    nuc$brogaard_pos[i] <- ref_pos[nearest]
  }
  nuc
}

# find the nearest Brogaard counterpart to each fast and slow nuc
# (each table is processed over its own row count)
cac_fast_nuc <- nearest_brogaard(cac_fast_nuc, brogaard_nucleosome)
cac_slow_nuc <- nearest_brogaard(cac_slow_nuc, brogaard_nucleosome)

# keep only nucleosomes with a reference counterpart closer than 50
fast_sub <- subset(cac_fast_nuc, dist_to_brogaard < 50)
slow_sub <- subset(cac_slow_nuc, dist_to_brogaard < 50)

# dinucleotide frequency

# Build a 0/1 matrix marking AA/TT/AT/TA dinucleotides around each nucleosome.
#
# nuc:    data frame with `chr` and `brogaard_pos` columns.
# genome: list of per-chromosome base vectors (lower-case letters), indexed by
#         chromosome name.
# flank:  number of bases taken on each side of ceiling(brogaard_pos).
# Returns a numeric matrix with one row per nucleosome and 2 * flank columns;
# entry [i, q] is 1 when bases q and q + 1 of the window are both 'a' or 't'.
at_dinucleotide_matrix <- function(nuc, genome, flank = 150) {
  hits <- matrix(0, nrow = nrow(nuc), ncol = 2 * flank)

  for (i in seq_len(nrow(nuc))) {
    if ((i %% 300) == 0) {
      cat(paste("Finishing nucleosome #", i, "\n"))
    }

    chr_i <- as.character(nuc$chr[i])
    chrom <- genome[[chr_i]]
    if (is.null(chrom)) {
      stop("chromosome '", chr_i, "' not found in genome", call. = FALSE)
    }

    centre <- ceiling(nuc$brogaard_pos[i])
    idx <- (centre - flank):(centre + flank)
    # Positions before the chromosome start become NA rather than zero or
    # negative subscripts; positions past the end already index to NA.
    idx[idx < 1] <- NA
    bases <- chrom[idx]

    # AA, AT, TA and TT are exactly the pairs whose two bases are both a/t.
    # %in% yields FALSE for NA, so windows overhanging a chromosome end count
    # as "no hit" instead of aborting on if (NA).
    is_at <- bases %in% c("a", "t")
    hits[i, ] <- as.numeric(is_at[-length(is_at)] & is_at[-1])
  }
  hits
}

fast.mtx <- at_dinucleotide_matrix(fast_sub, SacCer2)
slow.mtx <- at_dinucleotide_matrix(slow_sub, SacCer2)

# per-position hit counts summed over nucleosomes
slow_v <- colSums(slow.mtx)
fast_v <- colSums(fast.mtx)
# x positions 1..length(slow_v), passed through the same centred 3-point
# moving average as the y values (the first element is NA).
x <- stats::filter(seq(1, length(slow_v), 1), rep(1, 3) / 3)[1:299]  # rolling average of 3bp
# plot figure 3B
# NOTE(review): the comments call this figure 3B but the output file is
# figure3C.png -- confirm which label is intended.
png(file = '~/data/figure/figure3/figure3C.png', width = 5, height = 4,
    units = "in", res = 300, bg = "transparent", type = "cairo-png")
# The margins must be passed by name; an unnamed vector is not a graphical
# parameter, so par(c(4,4,4,4)) set nothing and only raised a warning.
par(mar = c(4, 4, 4, 4))
# per-nucleosome frequency of AA/TT/AT/TA, smoothed over 3 positions
plot(x, stats::filter(slow_v / nrow(slow_sub), rep(1, 3) / 3)[1:299],
     col = alpha('tomato3', 0.6), lwd = 2, type = 'l',
     ylab = 'AA/TT/AT/TA frequency',
     xlab = 'Distance from nucleosome dyad (bp)',
     ylim = c(0.25, 0.53), xaxt = 'n')
# relabel window positions as offsets from the window centre
axis(side = 1, at = c(50, 100, 150, 200, 250), labels = c(-100, -50, 0, 50, 100))
lines(x, stats::filter(fast_v / nrow(fast_sub), rep(1, 3) / 3)[1:299],
      col = alpha('forestgreen', 0.6), lwd = 2)
legend('bottomright', c('Slow', 'Fast'), col = c('tomato3', 'forestgreen'),
       lty = c(1, 1), lwd = c(2, 2), box.lwd = 0, bg = "transparent")
dev.off()

###
# figure3C count occurrence of all 6mers
###
k <- 6

# Collect every k-mer found in a window around each nucleosome peak.
#
# nuc:    data frame with `chr` and `peak` columns.
# genome: list of per-chromosome base vectors, indexed by chromosome name.
# k:      k-mer length.
# flank:  number of bases taken on each side of ceiling(peak).
# Returns a character vector of k-mers in scan order (nucleosome by nucleosome,
# left to right), with repeats.
window_kmers <- function(nuc, genome, k, flank = 70) {
  width <- 2 * flank + 1
  starts <- seq_len(width - k + 1)
  offsets <- seq_len(k) - 1
  per_nuc <- vector("list", nrow(nuc))

  for (i in seq_len(nrow(nuc))) {
    if ((i %% 500) == 0) {
      cat(paste("Finishing nucleosome #", i, "\n"))
    }

    chr_i <- as.character(nuc$chr[i])
    chrom <- genome[[chr_i]]
    if (is.null(chrom)) {
      warning("chromosome '", chr_i, "' not found in genome; nucleosome skipped",
              call. = FALSE)
      next
    }

    centre <- ceiling(nuc$peak[i])
    bases <- chrom[(centre - flank):(centre + flank)]
    # Paste the k shifted copies of the window column-wise: element j of the
    # result is bases[j], ..., bases[j + k - 1] joined together.
    per_nuc[[i]] <- do.call(
      paste0,
      lapply(offsets, function(offset) bases[starts + offset])
    )
  }
  unlist(per_nuc, use.names = FALSE)
}

slow_kmers <- window_kmers(cac_slow_nuc, SacCer, k)
fast_kmers <- window_kmers(cac_fast_nuc, SacCer, k)

# unique() keeps first-appearance order: k-mers seen in the slow set first,
# followed by k-mers that only occur in the fast set.
kmer_levels <- unique(c(slow_kmers, fast_kmers))

# Count occurrences in one pass instead of growing the data frame row by row.
kmer.df <- data.frame(
  kmer = kmer_levels,
  slow = as.numeric(tabulate(match(slow_kmers, kmer_levels),
                             nbins = length(kmer_levels))),
  fast = as.numeric(tabulate(match(fast_kmers, kmer_levels),
                             nbins = length(kmer_levels))),
  stringsAsFactors = FALSE
)

# get the longest A or longest T in each 6mer

# Length of the longest uninterrupted run of `base` in the character vector
# `bases` (0 when `base` does not occur).
longest_run <- function(bases, base) {
  runs <- rle(bases)
  max(0L, runs$lengths[runs$values == base])
}

# Stored as character so the plot treats the run length as a discrete colour
# category.
kmer.df$at_stretch <- vapply(
  as.character(kmer.df$kmer),
  function(kmer) {
    bases <- strsplit(kmer, "")[[1]]
    as.character(max(longest_run(bases, "t"), longest_run(bases, "a")))
  },
  character(1),
  USE.NAMES = FALSE
)

# plot 3C
# NOTE(review): the comment says 3C but the output file is figure3D.png --
# confirm which label is intended.

# Label table: keep the (upper-cased) k-mer text only for points with more than
# 500 slow occurrences and a slow/fast ratio above 6; all other labels are
# blank. Built with base R so the script does not need dplyr / magrittr, which
# it never loads.
label_df <- kmer.df
label_df$kmer <- ifelse(
  label_df$slow > 500 & label_df$slow / label_df$fast > 6,
  toupper(label_df$kmer),
  ""
)

# viridis() is namespace-qualified because no attached package provides it;
# viridisLite is installed as a dependency of ggplot2 (via scales).
p <- ggplot(kmer.df, aes(x = slow, y = fast, colour = at_stretch)) +
  geom_point(size = 1.3) +
  xlab('Sequence occurrence (slow)') +
  ylab('Sequence occurrence (fast)') +
  scale_colour_manual('Poly(dA:dT) length',
                      values = rev(viridisLite::viridis(7))) +
  geom_text_repel(data = label_df,
                  mapping = aes(x = slow, y = fast, label = kmer), size = 3,
                  box.padding = 0.7,
                  show.legend = FALSE, color = 'red3', max.overlaps = 40) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(colour = "black", linewidth = 0.8),
        axis.text = element_text(size = 10, face = 'bold'),
        axis.title = element_text(size = 12))

ggsave('~/data/figure/figure3/figure3D.png', p, width = 6, height = 3,
       units = 'in', dpi = 800)
 
# Empty axes only (a single white point series over fixed x/y ranges), written
# to a separate transparent PNG.
png(file = '~/data/figure/figure3/figure3C_bg.png', width = 5, height = 4,
    units = "in", res = 300, bg = "transparent", type = "cairo-png")
# The margins must be passed by name; an unnamed vector is not a graphical
# parameter, so par(c(4,4,4,4)) set nothing and only raised a warning.
par(mar = c(4, 4, 4, 4))
# Axis labels spelled "occurrence", matching the labels of the ggplot panel.
plot(c(0, 2500), c(0, 500), col = 'white',
     xlab = 'Sequence occurrence (slow)', ylab = 'Sequence occurrence (fast)')
dev.off()
